{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Todos os arquivos CSV possuem cabeçalho com o nome das colunas e campos separados por vírgula “,”.\n",
    "- Coloque a resposta final de cada questão com **dois dígitos após a vírgula.**\n",
    "- **seed 42**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sklearn\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "from sklearn import metrics\n",
    "from scipy.spatial.distance import cdist\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import LeaveOneOut\n",
    "from sklearn.linear_model import Ridge\n",
    "from math import sqrt\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "import scipy.stats as sp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "np.set_printoptions(precision=2,suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.23.1'"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "root = \"/home/felipe/python-sandbox/python3/notebooks/test/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X1</th>\n",
       "      <th>X2</th>\n",
       "      <th>X3</th>\n",
       "      <th>X4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.075701</td>\n",
       "      <td>0.470857</td>\n",
       "      <td>0.133139</td>\n",
       "      <td>-0.152900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.328561</td>\n",
       "      <td>0.427323</td>\n",
       "      <td>-0.181237</td>\n",
       "      <td>-0.173041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.470727</td>\n",
       "      <td>-0.150515</td>\n",
       "      <td>0.000954</td>\n",
       "      <td>0.070937</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.280037</td>\n",
       "      <td>-0.250587</td>\n",
       "      <td>-0.216587</td>\n",
       "      <td>-0.372430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.139359</td>\n",
       "      <td>-0.124966</td>\n",
       "      <td>0.081284</td>\n",
       "      <td>0.083914</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         X1        X2        X3        X4\n",
       "0 -0.075701  0.470857  0.133139 -0.152900\n",
       "1 -0.328561  0.427323 -0.181237 -0.173041\n",
       "2 -0.470727 -0.150515  0.000954  0.070937\n",
       "3 -0.280037 -0.250587 -0.216587 -0.372430\n",
       "4 -0.139359 -0.124966  0.081284  0.083914"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_agrup_q1 = pd.read_csv(root+\"agrupamento_Q1.csv\")\n",
    "df_agrup_q1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 4)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = df_agrup_q1.values\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>X1</th>\n",
       "      <th>X2</th>\n",
       "      <th>X3</th>\n",
       "      <th>X4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1.488531</td>\n",
       "      <td>-0.430586</td>\n",
       "      <td>0.208942</td>\n",
       "      <td>0.018485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0.149391</td>\n",
       "      <td>1.549535</td>\n",
       "      <td>0.219688</td>\n",
       "      <td>0.917854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1.276714</td>\n",
       "      <td>0.844500</td>\n",
       "      <td>1.211906</td>\n",
       "      <td>-0.494781</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>-0.224172</td>\n",
       "      <td>-0.265252</td>\n",
       "      <td>1.004367</td>\n",
       "      <td>-0.209741</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0.059576</td>\n",
       "      <td>0.946101</td>\n",
       "      <td>1.114006</td>\n",
       "      <td>0.401051</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0        X1        X2        X3        X4\n",
       "0           1  1.488531 -0.430586  0.208942  0.018485\n",
       "1           2  0.149391  1.549535  0.219688  0.917854\n",
       "2           3  1.276714  0.844500  1.211906 -0.494781\n",
       "3           4 -0.224172 -0.265252  1.004367 -0.209741\n",
       "4           5  0.059576  0.946101  1.114006  0.401051"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_agrup_centroides_q1 = pd.read_csv(root+\"agrup_centroides_Q1.csv\")\n",
    "df_agrup_centroides_q1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12, 4)"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids = df_agrup_centroides_q1.values[:,1:]\n",
    "centroids.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 1.01  0.01 -0.01  0.03]\n",
      " [ 1.    0.   -0.01  1.  ]\n",
      " [ 1.    0.01  1.    0.  ]\n",
      " [-0.03  0.02  0.01 -0.02]\n",
      " [ 0.99  0.99  0.97 -0.  ]]\n"
     ]
    }
   ],
   "source": [
    "kmeans = KMeans(n_clusters=5, random_state=42, init=centroids[:5,:],max_iter=10,n_init=1).fit(X)\n",
    "print(kmeans.cluster_centers_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3XmcHFW5//HPNxsJIRBIIpgQCFxA\nCCigE8TLIgGUgBg2jWGRTWT5kYCKN4IsIl7lyuUKl1UBEQUkhOUKYQsok8imJhFQQgDDmoUl7BBI\nQpLn98epmXSG2dPVNdPzfb9e9Zqq7uo6T/XM9NPnnKpzFBGYmZkBdCs6ADMz6zicFMzMrJ6TgpmZ\n1XNSMDOzek4KZmZWz0nBzMzqOSkUSNLZkq6rQDnDJIWkHtn2VEnH5F1uJZTzXCRdI+k/2/G6kLRZ\nOWJo4vi7SHo6r+M3Ul6u59Nekn4o6aqcjv2CpD2beK5dfxedlZNCjiS9X7KskPRhyfahZS7rGklL\nG5T5eDnLaK+SpPRog8cHZjG/0MrjVCSJdjQR8UBEfCqPY3fULwiSdpM0r/SxiPhZRHS4WKuNk0KO\nImKtugV4CfhqyWPX51DkeaVlRsS2OZSxOtaUtE3J9iHA80UFY2Yf56RQvF6SfifpPUmzJNXUPSFp\nsKRbJC2U9Lykk8pY7r9J+pukdyXdJmm9knJHZ7G8nX2T3Cp7/ChJk0v2+5ekm0q250rarpkyrwWO\nKNk+HPhd6Q5NnbOkUcAPgW80UgvaWNJD2Xt4r6SBLZ1L9tz2kv6eve5GoHdTgUvaTNI0Se9Iej3b\nv9Se2fvxtqRLJSl7XTdJZ0h6UdJr2e96ney530o6JVsfktWmTsy2/03Sm9nrV/nWnDV1fF/SP7J4\nbpTUu+T5CZJelrRA0jFNNQdJ+imwC3BJ9p5e0tL5ZK87WtJsSW9JmiJp42bet+be/xcknSbpyexY\nv5HUW1Jf4G5gcEmtd3BpTVEra59HZX93b0k6XtKI7H15u/R8svfzfklvZL+/6yX1byruZs6nn6Ra\nSReVvidVJSK8VGABXgD2bPDY2cBiYB+gO3Au8JfsuW7ATOAsoBewKfAcsFcTx78G+M8mnhsGBNAj\n254KzAe2AfoCtwDXZc9tASwCvgT0BCYAc0pieDuLbTDwIjAve92mwFtAt2bKHwbMzc51OPAUsCfw\nQmvOOXu/rmtw7KnAs1ncfbLt/2rFufTK4v9u9tzXgI+aeQ9vAE7PYuwN7FzyXAB3AP2BjYCFwKjs\nuaOzMjcF1gJuBa4teW5ytn5Idh43ljx3W7a+W937XPK39Lfsd7AeMBs4PntuFPAKsDWwJnBdFt9m\nTZzXVOCYBo81dz77ZeezFdADOAN4uIljN/n+l5zHE8DQ7Dweqnv/G55zw98/K/+mfpn9Pr5M+l/6\nA/AJYAjwGvDFbP/NsjjWAAYBfwYubO7/s+H/FjAge98b/RuplsU1heI9GBF3RcRy0jfpuiafEcCg\niDgnIpZGxHPAlcDYZo71/ewbUt3y22b2vTYinoiIRcCZwBhJ3YFvAHdGxH0R8RFwPunD9t+zGN4D\ntgN2BaYACyRtCXwReCAiVjRT5jzgaVIiODw731LtOWeA30TEMxHxITApi4/mzgXYkfRBdWFEfBQR\nNwPTmynjI2BjYHBELI6IBxs8/18R8XZEvATUlsRwKPCLiHguIt4HTgPGKnX6TwN2ltSN9H6eB+yU\nve6L2fNNuSgiFkTEm8DkkvLGZO/HrIj4gPRB2h5Nnc/xwLkRMTsilgE/A7ZrorbQ3Ptf55KImJud\nx0+Bg9sY50+y38e9pAR0Q0S8FhHzgQeA7QEiYk4Wx5KIWAj8gvQet9Zg0u/jpog4o40xdipOCsV7\npWT9A6B39oGxMan6XP8hT2o+Wb+ZY5
0fEf1LliOa2XduyfqLpA/IgaysAQCQfcjPJX3zgvSPsRvp\nQ2wa6ZvmF2n5Q6zO74AjSf/8DZNCe84ZPv4erpWtN3cug4H5kX0VzLxI0yYAAv6WNYcc3Z4YsvUe\nwPoR8Szpg2w7UjPOHaQk+ylafj+bK6/0d1u63hZNHX9j4H9Lfj9vkt6XIXxcS39LDeN7MXtNW7xa\nsv5hI9trAUhaX9JESfMlvUuqQQ2k9b5CSmi/bGN8nY6TQsc1F3i+wYd8v4jYp0zHH1qyvhHpm/Dr\nwALSPz4AWbvpUFJzE6xMCrtk69NoW1K4hfQP9lz2LbRUS+fc1iF9mzuXl4EhDdqFN2rqQBHxSkR8\nOyIGA8cBlzXWTt9SDFkZy1j54TWN1HTVK/t2O43U77Iu8Fgrjt/Qy8CGJdtDm9ox09b3dC5wXIPf\nUZ+IeLiRfVv6W2oY30bZa9oTV0t+lh3z0xGxNnAYKZm11pXAPcBdWZ9H1XJS6Lj+Brwn6QeS+kjq\nLmkbSSPKdPzDJA2XtCZwDnBz1oQ1CfiKpD0k9QROAZYAdf/004CRQJ+ImEeqoo8itbc+2rCQhrLm\nqt2Bxi4tbOmcXwWGZc0trdHcuTxC+nA+SVJPSQcCOzR1IElfl1T3YfsW6QOmuaayOjcA35W0iaS1\nSB9ON2ZNL5Dez3GkNm5INa9xpGbF5a08z1KTgKMkbZX9bs9sYf9XSf0drfVL4DRJWwNIWkfS15uJ\npbm/JYATJW2odKHD6UBdB/6rwABlnfJl0A94H3hH0hDgP9pxjHGk5s/JkvqUKa4Ox0mhg8o+EPYl\nNS08T/oWfxXQ3D/JBK16n8Lrzex7LakD7RVSR91JWblPk75FXZyV+VXSpbRLs+efIf1zPZBtv0vq\nDH6otR9iETEjazpp6znXXen0hqS/t6KcJs8lO58DSU1Zb5Lav29t5nAjgL9Keh+4HTg56/NoydWk\n9/rP2TktBsaXPD+N9IFVlxQeJHUQ/5l2iIi7gYtI/QBzgL9kTy1p4iX/C3wtu3rnolYc//+AnwMT\ns2aYJ4C9m9i32b+lzO+Be0l/Q8+SOnSJiKdICfW5rKmqrc1KDf0Y+CzwDnAnzf+uG5U1NR5L6hu7\nTSVXfFUTrdqkambVJLsE9AlgjZLaSYegdNPiMRHxx6JjsZVcUzCrMpIOkLSGpHVJ3+ond7SEYB2X\nk4JZ9TmOdI3+s8By4IRiw7HOxM1HZmZWzzUFMzOr16PoANpq4MCBMWzYsKLDMDPrVGbOnPl6RAxq\nab9OlxSGDRvGjBkzig7DzKxTkdTcHfv13HxkZmb1nBTMzKyek4KZmdVzUjAzs3pOCmZmVq/qk8J5\n50Ft7aqP1damx83MbFW5JgVJoyQ9LWmOpFMbeX5jSX/K5lSdWjI0cdmMGAFjxqxMDLW1aXtEuQag\nNjOrIrndp5BN7XgpaV7UecB0SbdHxJMlu50P/C4ifitpd9Icxd8sZxwjR8KkSXDggTB8ODzzTNoe\nObKcpZiZVYc8awo7AHOyuWmXAhNJk36XGg7cn63XNvJ8WYwcCTvuCA8/DKNHOyGYmTUlz6QwhFXn\nX53Hx+dxfZw00QnAAUA/SQMaHkjSsZJmSJqxcOHCNgdSWwvTp0OvXnD99R/vYzAzs6TojubvA1+U\n9Chpjt/5pKF+VxERV0RETUTUDBrU4tAdq6jrQ7jpJjjuOFi2DL72NScGM7PG5JkU5rPqpNwbsuqE\n3UTEgog4MCK2J83PSkS8Xc4gpk9f2YcwbhwsX56akKZPL2cpZmbVIbf5FCT1AJ4B9iAlg+nAIREx\nq2SfgcCbEbFC0k+B5RFxVnPHrampidUZEG/UKPjnP+GFF6Bnz3YfxsysU5E0MyJqWtovt5pCNv3f\nOGAKMBuYFBGzJJ0jaXS2227A05KeAdYHfppXPHXGjYMFC+DWNk/bbWZW/TrdzGurW1NYvhy22AIG\nD4
YHHihjYGZmHVjhNYWOqnt3OPFEePBBeOyxoqMxM+tYulxSADjqKFhzTbj44qIjMTPrWLpkUlh3\nXTjsMPj97+GNN4qOxsys4+iSSQFSh/PixfDrXxcdiZlZx9Flk8KnPw277QaXXZY6n83MrAsnBUi1\nhRdfhMmTi47EzKxj6NJJYb/9YOhQuOSSoiMxM+sYunRS6NEDTjgB/vQnePLJlvc3M6t2XTopABxz\nDKyxhmsLZmbgpMCgQTB2LPzud/DOO0VHY2ZWrC6fFADGj4dFi+Caa4qOxMysWE4KwOc+l2Zmu+QS\nWLGi6GjMzIrjpJAZPx7mzIF77y06EjOz4jgpZL72NVh/fY+HZGZdm5NCplevNF3n3XenGoOZWVfk\npFDiuOPS0NqXXVZ0JGZmxXBSKDF4cGpGuvpqeP/9oqMxM6s8J4UGxo9P9ytcd13RkZiZVZ6TQgNf\n+AJsv326PLWTzVRqZrbanBQakFJtYdYsmDq16GjMzCrLSaERY8fCgAG+PNXMuh4nhUb06ZMGyrvt\nNnjppaKjMTOrHCeFJpxwQvp5+eXFxmFmVklOCk3YeOM0Cc+VV8KHHxYdjZlZZTgpNGPcOHjjDbjx\nxqIjMTOrDCeFZowcCVtvnTqcfXmqmXUFuSYFSaMkPS1pjqRTG3l+I0m1kh6V9A9J++QZT1tJqbbw\n97/DI48UHY2ZWf5ySwqSugOXAnsDw4GDJQ1vsNsZwKSI2B4YC3S4UYcOOwzWWcfTdZpZ15BnTWEH\nYE5EPBcRS4GJwH4N9glg7Wx9HWBBjvG0y1prwVFHwU03wcsvFx2NmVm+8kwKQ4C5JdvzssdKnQ0c\nJmkecBcwvrEDSTpW0gxJMxYuXJhHrM068URYtgx+9auKF21mVlFFdzQfDFwTERsC+wDXSvpYTBFx\nRUTURETNoEGDKh7kZpvB3nunpLB0acWLNzOrmDyTwnxgaMn2htljpb4FTAKIiEeA3sDAHGNqt/Hj\n4ZVX4JZbio7EzCw/eSaF6cDmkjaR1IvUkXx7g31eAvYAkLQVKSlUvn2oFfbaK9UYPB6SmVWz3JJC\nRCwDxgFTgNmkq4xmSTpH0uhst1OAb0t6HLgBODKiY94R0K1b6lt45BGYObPoaMzM8qEO+hncpJqa\nmpgxY0YhZb/zDgwZAl//OvzmN4WEYGbWLpJmRkRNS/sV3dHcqayzDhx+ONxwAxRwEZSZWe6cFNpo\n3DhYsgSuuqroSMzMys9JoY2GD4fdd09Dai9bVnQ0Zmbl5aTQDuPHw9y5cHvDa6nMzDo5J4V2+OpX\nYaONfHmqmVUfJ4V26N4d/t//g6lT4Yknio7GzKx8nBTa6ZhjoHdvj55qZtXFSaGdBgyAQw6Ba6+F\nt94qOhozs/JwUlgN48bBBx/ANdcUHYmZWXk4KayG7beHnXaCSy+FFSuKjsbMbPU5Kaym8ePh2Wfh\n7ruLjsTMbPU5KaymAw+ET37SHc5mVh2cFFZTz55w/PFwzz3wzDNFR2NmtnqcFMrg2GNTcrj00qIj\nMTNbPU4KZbDBBmk47WuugffeKzoaM7P2c1Iok/Hj4d13030LZmadlZNCmXz+81BTkzqcO9m8RWZm\n9ZwUykRKtYXZs+H++4uOxsysfZwUymjMGBg40KOnmlnn5aRQRr17pyuRJk+GF14oOhozs7ZzUiiz\n449PTUmXX150JGZmbeekUGZDh8L++6c5nD/8sOhozMzaxkkhB+PHw5tvwu9/X3QkZmZt46SQg113\nhU9/2penmlnn46SQAynNtfDYY/DQQ0VHY2bWek4KOTn0UOjf35enmlnnkmtSkDRK0tOS5kg6tZHn\nL5D0WLY8I+ntPOOppL594Vvfgltvhfnzi47GzKx1cksKkroDlwJ7A8OBgyUNL90nIr4bEdtFxHbA\nxcCtecVThO7dYdky+NWvVj5WWwvnnVdcTGZmzcmzprADMCcinouI
pcBEYL9m9j8YuCHHeCpu1Cjo\n1Ss1IS1ZkhLCmDEwYkTRkZmZNS7PpDAEmFuyPS977GMkbQxsAjQ6apCkYyXNkDRj4cKFZQ80LyNH\nwk9+Am+/nYbWHjMGJk1Kj5uZdUQdpaN5LHBzRCxv7MmIuCIiaiKiZtCgQRUObfV8//tpus7Jk+HI\nI50QzKxjyzMpzAeGlmxvmD3WmLFUWdNRnWnTYPHidJnqxRenJiQzs44qz6QwHdhc0iaSepE++G9v\nuJOkLYF1gUdyjKUQdX0It9wCEyakfoUDDnBiMLOOK7ekEBHLgHHAFGA2MCkiZkk6R9Lokl3HAhMj\nqu/e3+nTV/Yh/OhH8G//li5VffjhoiMzM2tcjzwPHhF3AXc1eOysBttn5xlDkSZMWLnepw9ccQXs\nsYfncTazjqvVNQVJ3SUNlrRR3ZJnYNVo993h6KPh/PPTEBhmZh1Nq5KCpPHAq8B9wJ3ZckeOcVWt\n//5vGDAAvv3tdGObmVlH0tqawsnApyJi64j4dLZ8Js/AqtV668FFF8GMGemnmVlH0tqkMBd4J89A\nupIxY2DffeHMM+H554uOxsxspdZ2ND8HTJV0J7Ck7sGI+EUuUVU5CS67DIYPhxNOgLvvTo+ZmRWt\ntTWFl0j9Cb2AfiWLtdPQoXDuuTBlClx/fdHRmJklasvtAZLWAoiI93OLqAU1NTUxY8aMooovq+XL\nYeed4V//gtmzoZON4GFmnYikmRFR09J+rb36aBtJjwKzgFmSZkraenWD7Oq6d4crr4R334Xvfa/o\naMzMWt98dAXwvYjYOCI2Bk4BrswvrK5jm23g1FPhuutSU5KZWZFamxT6RkT9iD0RMRXom0tEXdAP\nfwif+hQcfzwsWlR0NGbWlbU2KTwn6UxJw7LlDNIVSVYGvXunZqQXXoCzzmpxdzOz3LQ2KRwNDCJN\nl3lrtn50XkF1RbvsAscdBxdemG5sMzMrQpuuPuoIqunqo4beeQe22grWXx/+9jfo2bPoiMysWpTl\n6iNJF2Y/J0u6veFSrmAtWWcduPTSNFjeL3xboJkVoKU7mq/Nfp6fdyCWHHBAWs4+Gw46CDbbrOiI\nzKwrabamEBEzs9XtImJa6QJsl394XdMll0CvXqmPoZO17plZJ9fajuYjGnnsyDLGYSUGD4bzzoP7\n74drrik6GjPrSlrqUzhY0mRg0wb9CbXAm5UJsWv69rfTFUmnnAKvvlp0NGbWVbTUp/Aw8DIwEPif\nksffA/6RV1AG3bql6Tu33RZOPhkmTiw6IjPrClrqU3gReABY3KBP4e8R4XnDcrbllnDGGXDjjXDn\nnUVHY2ZdQYt9ChGxHFghaZ0KxGMN/OAHsPXWad6F994rOhozq3at7Wh+H/inpF9LuqhuyTMwS3r1\nSkNgzJsHp59edDRmVu1aO/Na3fAWVoAvfAFOPDFdqnrIIbDjjkVHZGbVqtXDXEjqBWyRbT4dER/l\nFlUzqnmYi+a8916avrN/f5g5M9UgzMxaq9yT7OwG/Au4FLgMeEbSrqsVobVJv35w+eXwxBPpHgYz\nszy0tk/hf4AvR8QXI2JXYC/ggvzCssbsuy+MGQM/+Qk89VTR0ZhZNWptUugZEU/XbUTEM0CLY3hK\nGiXpaUlzJJ3axD5jJD0paZak37cyni7roougb1849lhYsaLoaMys2rQ2KcyQdJWk3bLlSqDZhn1J\n3UnNTXsDw4GDJQ1vsM/mwGnAThGxNfCdNp9BF7P++nD++fDAA3DVVUVHY2bVprVJ4QTgSeCkbHkS\nOL6F1+wAzImI5yJiKTAR2K/BPt8GLo2ItwAi4rXWBt6VHXUUjBwJEybAggVFR2Nm1aS1SeH4iPhF\nRByYLReQEkVzhgBzS7bnZY+V2gLYQtJDkv4iaVRjB5J0rKQZkmYsXLiwlSFXLwl+9StYsgTGjy86\nGjOrJkWPktoD2BzYDTgYuFJS
/4Y7RcQVEVETETWDBg0qQ7Gd3+abw49+BLfeCn/4Q9HRmFm1aO0o\nqZs0GCV1Ki2PkjofGFqyvWH2WKl5wO0R8VFEPA88Q0oS1gqnnJIGzDvxxDSVp5nZ6spzlNTpwOaS\nNiElg7HAIQ32+QOphvAbSQNJzUnPtS5069kzdTZ//vNw6qnpPgYzs9XR4iipETEV2BN4IJtx7WXS\nt3618NplwDhgCjAbmBQRsySdI2l0ttsU4A1JTwK1wH9ExBurc0JdTU1NGlr7l7+EBx8sOhoz6+xa\nNcyFpJnALsC6wEOkWsDSiDg03/A+rqsOc9GcRYtgm22gd2947DFYY42iIzKzjqasw1yQkscHwIHA\nZRHxdWDr1QnQyqdv31RTeOop+NnPio7GzDqzVicFSV8ADgXqpnvpnk9I1h577QWHHgrnnguzZhUd\njZl1Vq1NCt8h3Xn8f1m/wKakPgDrQC64ANZeO83v7CEwzKw9WpUUsik4R0fEz7Pt5yLipHxDs7Ya\nNCglhkce8ZVIZtY+zV6SKunCiPhOdq/Cx3qkI2J0Iy+zAh12GFx3HZx2GoweDUOHtvwaM7M6Ld2n\ncG328/y8A7HykGD77WHatHRT2223pcdqa2H69DRekplZU5pNChExM/s5TdKgbN2DD3Vwe+2Vpu6c\nPBluvhkGDkzzMEyaVHRkZtbRtdinIOlsSa8DT5NmXFso6az8Q7P2GjkyjYfUowcccQQcdFBKCCNH\nFh2ZmXV0LY199D1gJ2BERKwXEesCnwd2kvTdSgRo7bPnnnDccfDhh7BsGWyySdERmVln0FJN4ZvA\nwdlgdUC68gg4DDg8z8Bs9dTWwo03phna3n8fdtoJXn656KjMrKNrKSn0jIjXGz6Y9Su0OB2nFaO2\ndmUfwq9+BRdfnCbj+fd/hzdbGtvWzLq0lpLC0nY+ZwWaPn3VPoQTT0xTeM6dC3vvDe+9V2x8ZtZx\nNTsgnqTlwKLGngJ6R0TFawseEK/9br8dDjwQdtkF7roL+vQpOiIzq5SyDIgXEd0jYu1Gln5FJARb\nPaNHw29/m+5hGDMGPvqo6IjMrKNp7dhHViUOPRQuuwzuuAMOPxyWLy86IjPrSFq6o9mq0PHHp+k7\nTz01DaD3y1+mu57NzJwUuqgf/CAlhnPPhXXWgZ//3InBzJwUurSf/hTefRf++79TYjj99KIjMrOi\nOSl0YRJcdFFKDGeckZqSxo8vOiozK5KTQhfXrRtcfXW6d+Gkk1JiOOKIoqMys6L46iOjRw+YODGN\nl3T00XDrrUVHZGZFcVIwANZYI42s+vnPw9ixcO+9RUdkZkVwUrB6ffumO52HD4f994eHHio6IjOr\nNCcFW0X//qmWMHQo7LMPPPpo0RGZWSU5KdjHfOIT8Mc/pgTx5S/DU08VHZGZVYqTgjVq6NCUGLp3\nTx3QL7xQdERmVgm5JgVJoyQ9LWmOpFMbef7IbHrPx7LlmDzjsbbZfPPUlLRoEXzpS/DKK0VHZGZ5\nyy0pSOoOXArsDQwHDpY0vJFdb4yI7bLlqrzisfb5zGfg7rvTrG1f+pIn6TGrdnnWFHYA5kTEcxGx\nFJgI7JdjeZaTHXdMczH861+epMes2uWZFIYAc0u252WPNXSQpH9IulnS0MYOJOlYSTMkzVi4cGEe\nsVoLdt89zeY2cybstx8sXlx0RGaWh6I7micDwyLiM8B9wG8b2ykiroiImoioGTRoUEUDtJXqJumZ\nOtWT9JhVqzyTwnyg9Jv/htlj9SLijYhYkm1eBXwux3isDOom6Zk8OY2R5El6zKpLngPiTQc2l7QJ\nKRmMBQ4p3UHSJyPi5WxzNDA7x3isTEon6enXz5P0mFWT3JJCRCyTNA6YAnQHro6IWZLOAWZExO3A\nSZJGA8uAN4Ej84rHysuT9JhVp1yHzo6Iu4C7Gjx2Vsn6acBpecZg+fEkPWbVx/MpWLt5kh6z6u
Ok\nYKvFk/SYVZeiL0m1KlA6Sc9RR8GPf7zq87W1cN55xcRmZm3jpGBlUTdJz1Zbwdlnp34GSAlhzBgY\nMaLQ8Mysldx8ZGXTt2+amKemBiZMgFmz4M47053QI0cWHZ2ZtYZrClZW/fvDww/DgAHp7uf99nNC\nMOtMnBSs7GbNgoiUIK6+Gq7y2LdmnYaTgpVVXR/CzTenqTwHDIBjj4Xf/KboyMysNZwUrKymT1/Z\nhzBsWGpK6t8fTj4Z5swpOjoza4mTgpXVhAmr9iFsvjk8+GC6OmmPPeDFF4uLzcxa5qRguRs+HO67\nL935vPvuMH9+y68xs2I4KVhFbLcdTJkCCxemm9xee63oiMysMU4KVjE77JDuW3jpJc/3bNZROSlY\nRe2yC9x2Gzz9NOy1Vxp+28w6DicFq7g994RbboHHH4d99oH33y86IjOr46RghfjKV+CGG+Cvf01z\nP3/4YdERmRk4KViBDjooDYUxdSoceCAsWdLiS8wsZ04KVqhDD4Urr4R77oGxY+Gjj4qOyKxrc1Kw\nwn3rW3DxxWno7cMPh+XLi47IrOvy0NnWIYwbl/oVJkyA3r3h179Os7qZWWU5KViH8R//AR98kCbp\n6dMHLr00zQNtZpXjpGAdyllnpRrDz3+eagz/8z9ODGaV5KRgHYoE556bEsMFF6TZ3H7yk6KjMus6\nnBSsw5HgwgtTYvjP/0xNST/8YdFRmXUNTgrWIUlw+eUpMZx+ekoM3/1u0VGZVT8nBeuwundPM7Yt\nXgzf+15KDMcfX3RUZtUt14v+JI2S9LSkOZJObWa/gySFpJo847HOp0cPuP562HdfOOGEdAe0meUn\nt6QgqTtwKbA3MBw4WNLwRvbrB5wM/DWvWKxz69ULbropDbd99NFw441FR2RWvfKsKewAzImI5yJi\nKTAR2K+R/X4C/BxYnGMs1sn17p3ueN555zQ0xh/+UHREZtUpz6QwBJhbsj0ve6yepM8CQyPizuYO\nJOlYSTMkzVi4cGH5I7VOYc014Y47oKYGvvGNNF6SmZVXYQMJSOoG/AI4paV9I+KKiKiJiJpBgwbl\nH5x1WP36pWSw9dZwwAFQW1t0RGbVJc+kMB8YWrK9YfZYnX7ANsBUSS8AOwK3u7PZWtK/P9x7L2y6\nKXz1q/Dww0VHZFY98kwK04HNJW0iqRcwFri97smIeCciBkbEsIgYBvwFGB0RM3KMyarEwIHwpz/B\n4MGw994ww381ZmWRW1KIiGXAOGAKMBuYFBGzJJ0jaXRe5VrXscEGKTGstx58+cvwj38UHZFZ56eI\nKDqGNqmpqYkZ/lpoJZ5/HrbbLt0F/Ze/wJZbpsdra2H69DQct1lXJ2lmRLTYPO8R663T22STNEnP\nu++mS1YffRSmTIExY2DEiKI+j8n7AAAJMUlEQVSjM+tcPMyFVYXDD4dly+CYY+Czn02PrblmGhZj\n/fXTssEGK9cbbvfu3b5yzzsvJZ6RI1c+5hqKdWZOClY1jj4aHnss1RpGjoRttoFXX03LP/8Jf/wj\nvP12469dZ52mE0bDZFKaQEaMSDWSSZNSmbW1K7fNOiMnBasatbVwww1w5plphNUzz1z1GzzAkiUr\nE0Xd8sorq663JYFssEFqstp331TWAw/Aj3+cro5asCB1gre3FmJWBHc0W1Uo/Ybe8Bt7w8TQWosX\nw2uvNZ48Gm43lUAgNWOttx4MGND6n+uum8Z8ao6brqwtWtvR7JqCVYXp01dNACNHpu3p09ufFHr3\nho02Sktz6hLQoYemUVzPOCO95s034Y03Pv5z1qyV28uWNX3cfv2aTx5vvQX77w8//SnstRc88wwc\neaSbrmz1uKZgthpWp4YSAe+913TyaOrnW2/BihWNH7NXr3RDX3N9InXba63VvvmvXUPpnFxTMKuA\n1amhSLD22mkZNqz1Za5YAe+8szJJXHhh6kvZYw/YdtuVTV
rPPZeGAHn99ZSAGurTp3XJo2ECced6\ndXNNwawTq/tAPuGE1LneWA1l2bKUGBp2qDfWP9JUAllzzVUTxvLlcP/9sOuu8NBDacrUkSNTX8i6\n66bxqXrk8JXTtZT2c03BrMo1bKoaObLxpqsePdK3/g02aPmYy5bBwoXNX5317LNp/YMPVg5ffmoj\n8yquvXZKEOuttzJZtGZ97bWhWxO31bqWkj8nBbNOKo/O9R494JOfTEtz6j6Mv/lNuOaadBnuppum\nJq233kpLw/Unn1y5vnRp08fu1i3VNJpKHAcdBKNHp871++5LZa+9Njz1FPTtm5q6+vZt+eqttuhK\nNRQ3H5lZm6zu5b8R8OGHTSePxtZLt5cvb12cPXqsmiSaW2/puSeegJNPTleX7bMPTJu2+pc8V1pr\nm4+cFMysTYr81hwBd92VhjUZMwYmTkyXAG+xBbz/PixalJbS9Ybbja0vbsNkwFJattgiDb44ZMiq\ny+DB6efaa+f3PrSHk4KZVZ08blKEVPtoKomUbt9yS7rbfdtt4ROfSHetz5/f+M2La6318UTRMHls\nsAH07Nl8bOVKwu5oNrOqk0c/CkD37isvD25KbW0aW6tuGJULLlhZ5qJFKxNE3c+6ZcGCNPzJggXw\n0UerHlNKV3M1lzy23LKyneuuKZiZtaAcNZQVK9Ilv40ljtIE8sYbH39tr17pyrD994c//7l9NSPX\nFMzMyqQcNZRu3VKT0yc+Adtv3/R+ixc3Xuu45x649dbGB3osJ9cUzMw6uNbcpNgSz7xmZlYFSpuq\nzjkn/RwzJj2eBycFM7MOrLmmqzy4+cjMrAtw85GZmbWZk4KZmdVzUjAzs3pOCmZmVs9JwczM6nW6\nq48kLQRebOfLBwKvlzGczlC2z7n6yy2ybJ9z5yl744gY1NJOnS4prA5JM1pzSVY1le1zrv5yiyzb\n51x9Zbv5yMzM6jkpmJlZva6WFK7ogmX7nKu/3CLL9jlXWdldqk/BzMya19VqCmZm1gwnBTMzq9cl\nkoKkqyW9JumJCpc7VFKtpCclzZJ0cgXL7i3pb5Iez8r+caXKzsrvLulRSXdUuNwXJP1T0mOSKjac\nrqT+km6W9JSk2ZK+UKFyP5Wda93yrqTvVKjs72Z/W09IukFS7wqVe3JW5qy8z7Wxzw5J60m6T9K/\nsp/rVqjcr2fnvEJSbpeldomkAFwDjCqg3GXAKRExHNgROFHS8AqVvQTYPSK2BbYDRknasUJlA5wM\nzK5geaVGRsR2Fb6W/H+BeyJiS2BbKnTuEfF0dq7bAZ8DPgD+L+9yJQ0BTgJqImIboDswtgLlbgN8\nG9iB9D7vK2mzHIu8ho9/dpwK/CkiNgf+lG1XotwngAOBP+dQXr0ukRQi4s/AmwWU+3JE/D1bf4/0\nQTGkQmVHRLyfbfbMlopcVSBpQ+ArwFWVKK9oktYBdgV+DRARSyPi7QJC2QN4NiLae8d/W/UA+kjq\nAawJLKhAmVsBf42IDyJiGTCN9EGZiyY+O/YDfput/xbYvxLlRsTsiHi63GU11CWSQkcgaRiwPfDX\nCpbZXdJjwGvAfRFRqbIvBCYAKypUXqkA7pU0U9KxFSpzE2Ah8JusyewqSX0rVHapscANlSgoIuYD\n5wMvAS8D70TEvRUo+glgF0kDJK0J7AMMrUC5pdaPiJez9VeA9Stcfq6cFCpA0lrALcB3IuLdSpUb\nEcuzZoUNgR2yqneuJO0LvBYRM/Muqwk7R8Rngb1JzXW7VqDMHsBngcsjYntgEfk0KTRJUi9gNHBT\nhcpbl/SNeRNgMNBX0mF5lxsRs4GfA/cC9wCPAcvzLreZeIIK1cArxUkhZ5J6khLC9RFxaxExZE0Z\ntVSmX2UnYLSkF4CJwO6SrqtAuUD9N1gi4jVS2/oOFSh2HjCvpCZ2MylJVNLewN8j4tUKlbcn8HxE\nLIyIj4BbgX+vRMER8e
uI+FxE7Aq8BTxTiXJLvCrpkwDZz9cqXH6unBRyJEmkdubZEfGLCpc9SFL/\nbL0P8CXgqbzLjYjTImLDiBhGas64PyJy/wYJIKmvpH5168CXSc0NuYqIV4C5kj6VPbQH8GTe5TZw\nMBVqOsq8BOwoac3s73wPKtS5LukT2c+NSP0Jv69EuSVuB47I1o8Abqtw+fmKiKpfSP8sLwMfkb7V\nfatC5e5Mqlr+g1TNfQzYp0JlfwZ4NCv7CeCsAt733YA7KljepsDj2TILOL2CZW8HzMje7z8A61aw\n7L7AG8A6Ff79/pj0ReMJ4FpgjQqV+wAp6T4O7JFzWR/77AAGkK46+hfwR2C9CpV7QLa+BHgVmJLH\nOXuYCzMzq+fmIzMzq+ekYGZm9ZwUzMysnpOCmZnVc1IwM7N6Tgpmq0nSsEqPwGuWFycFMzOr56Rg\nVkaSNs0GxRtRdCxm7dGj6ADMqkU2zMVE4MiIeLzoeMzaw0nBrDwGkcbAOTAiKj3ukVnZuPnIrDze\nIQ0St3PRgZitDtcUzMpjKWnAsimS3o+ISo/caVYWTgpmZRIRi7JJhu7LEsPtRcdk1lYeJdXMzOq5\nT8HMzOo5KZiZWT0nBTMzq+ekYGZm9ZwUzMysnpOCmZnVc1IwM7N6/x/T5ZZezY2qsAAAAABJRU5E\nrkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f72368de630>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.clf()\n",
    "\n",
    "colors = ['b', 'g', 'r']\n",
    "markers = ['o', 'v', 's']\n",
    "\n",
    "# k means determine k\n",
    "distortions = []\n",
    "K = range(1,12)\n",
    "for k in K:\n",
    "    kmeanModel = KMeans(n_clusters=k, random_state=42, init=centroids[:k,:],max_iter=10,n_init=1).fit(X)\n",
    "    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])\n",
    "\n",
    "# Plot the elbow\n",
    "plt.plot(K, distortions, 'bx-')\n",
    "plt.xlabel('k')\n",
    "plt.ylabel('Distortion')\n",
    "plt.title('The Elbow Method showing the optimal k')\n",
    "plt.xticks(np.arange(1,12,1))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Genero</th>\n",
       "      <th>Idade</th>\n",
       "      <th>Escolaridade</th>\n",
       "      <th>Profissao</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>F</td>\n",
       "      <td>a - Ate 25 anos</td>\n",
       "      <td>Fundamental</td>\n",
       "      <td>b</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M</td>\n",
       "      <td>a - Ate 25 anos</td>\n",
       "      <td>Medio</td>\n",
       "      <td>d</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>F</td>\n",
       "      <td>c - 36 a 45 anos</td>\n",
       "      <td>Fundamental</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M</td>\n",
       "      <td>d - 46 a 55 anos</td>\n",
       "      <td>Fundamental</td>\n",
       "      <td>a</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>F</td>\n",
       "      <td>c - 36 a 45 anos</td>\n",
       "      <td>Fundamental</td>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Genero             Idade Escolaridade Profissao  Target\n",
       "0      F  a - Ate 25 anos   Fundamental         b       0\n",
       "1      M  a - Ate 25 anos         Medio         d       1\n",
       "2      F  c - 36 a 45 anos  Fundamental         a       1\n",
       "3      M  d - 46 a 55 anos  Fundamental         a       0\n",
       "4      F  c - 36 a 45 anos  Fundamental         b       1"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3 = pd.read_csv(root+\"classificacao_Q3.csv\")\n",
    "df_classif_q3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     1000\n",
       "unique       2\n",
       "top          F\n",
       "freq       583\n",
       "Name: Genero, dtype: object"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3['Genero'].astype('category').describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count                 1000\n",
       "unique                   5\n",
       "top       b - 26 a 35 anos\n",
       "freq                   297\n",
       "Name: Idade, dtype: object"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3['Idade'].astype('category').describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count            1000\n",
       "unique              4\n",
       "top       Fundamental\n",
       "freq              501\n",
       "Name: Escolaridade, dtype: object"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3['Escolaridade'].astype('category').describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     1000\n",
       "unique       5\n",
       "top          b\n",
       "freq       423\n",
       "Name: Profissao, dtype: object"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3['Profissao'].astype('category').describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_classif_q3_ohe = df_classif_q3.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_classif_q3_ohe = pd.concat([\n",
    "    df_classif_q3_ohe,pd.get_dummies(df_classif_q3_ohe['Genero'], prefix='Genero')\n",
    "],axis=1).drop(['Genero'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_classif_q3_ohe = pd.concat([\n",
    "    df_classif_q3_ohe,pd.get_dummies(df_classif_q3_ohe['Idade'], prefix='Idade')\n",
    "],axis=1).drop(['Idade'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_classif_q3_ohe = pd.concat([\n",
    "    df_classif_q3_ohe,pd.get_dummies(df_classif_q3_ohe['Escolaridade'], prefix='Escolaridade')\n",
    "],axis=1).drop(['Escolaridade'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_classif_q3_ohe = pd.concat([\n",
    "    df_classif_q3_ohe,pd.get_dummies(df_classif_q3_ohe['Profissao'], prefix='Profissao')\n",
    "],axis=1).drop(['Profissao'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Target</th>\n",
       "      <th>Genero_F</th>\n",
       "      <th>Genero_M</th>\n",
       "      <th>Idade_a - Ate 25 anos</th>\n",
       "      <th>Idade_b - 26 a 35 anos</th>\n",
       "      <th>Idade_c - 36 a 45 anos</th>\n",
       "      <th>Idade_d - 46 a 55 anos</th>\n",
       "      <th>Idade_e - Mais 56 anos</th>\n",
       "      <th>Escolaridade_Fundamental</th>\n",
       "      <th>Escolaridade_Medio</th>\n",
       "      <th>Escolaridade_Pos-graduacao</th>\n",
       "      <th>Escolaridade_Superior</th>\n",
       "      <th>Profissao_a</th>\n",
       "      <th>Profissao_b</th>\n",
       "      <th>Profissao_c</th>\n",
       "      <th>Profissao_d</th>\n",
       "      <th>Profissao_e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Target  Genero_F  Genero_M  Idade_a - Ate 25 anos   Idade_b - 26 a 35 anos  \\\n",
       "0       0         1         0                       1                       0   \n",
       "1       1         0         1                       1                       0   \n",
       "2       1         1         0                       0                       0   \n",
       "3       0         0         1                       0                       0   \n",
       "4       1         1         0                       0                       0   \n",
       "\n",
       "   Idade_c - 36 a 45 anos  Idade_d - 46 a 55 anos  Idade_e - Mais 56 anos  \\\n",
       "0                       0                       0                       0   \n",
       "1                       0                       0                       0   \n",
       "2                       1                       0                       0   \n",
       "3                       0                       1                       0   \n",
       "4                       1                       0                       0   \n",
       "\n",
       "   Escolaridade_Fundamental  Escolaridade_Medio  Escolaridade_Pos-graduacao  \\\n",
       "0                         1                   0                           0   \n",
       "1                         0                   1                           0   \n",
       "2                         1                   0                           0   \n",
       "3                         1                   0                           0   \n",
       "4                         1                   0                           0   \n",
       "\n",
       "   Escolaridade_Superior  Profissao_a  Profissao_b  Profissao_c  Profissao_d  \\\n",
       "0                      0            0            1            0            0   \n",
       "1                      0            0            0            0            1   \n",
       "2                      0            1            0            0            0   \n",
       "3                      0            1            0            0            0   \n",
       "4                      0            0            1            0            0   \n",
       "\n",
       "   Profissao_e  \n",
       "0            0  \n",
       "1            0  \n",
       "2            0  \n",
       "3            0  \n",
       "4            0  "
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3_ohe.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.772"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q3_ohe['Target'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1000"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_classif_q3_ohe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((500, 16), (500, 16), (500, 1), (500, 1))"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train = df_classif_q3_ohe.drop(['Target'],axis=1).values[:500,:]\n",
    "X_test = df_classif_q3_ohe.drop(['Target'],axis=1).values[500:,:]\n",
    "y_train = df_classif_q3_ohe[['Target']].values[:500,:]\n",
    "y_test = df_classif_q3_ohe[['Target']].values[500:,:]\n",
    "\n",
    "X_train.shape,X_test.shape,y_train.shape,y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GaussianNB(priors=None)"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = GaussianNB()\n",
    "clf.fit(X_train,y_train.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.604"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds_train = clf.predict(X_train)\n",
    "metrics.accuracy_score(y_train,preds_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.546"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds_test = clf.predict(X_test)\n",
    "metrics.accuracy_score(y_test,preds_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x0</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>x8</th>\n",
       "      <th>x9</th>\n",
       "      <th>...</th>\n",
       "      <th>x91</th>\n",
       "      <th>x92</th>\n",
       "      <th>x93</th>\n",
       "      <th>x94</th>\n",
       "      <th>x95</th>\n",
       "      <th>x96</th>\n",
       "      <th>x97</th>\n",
       "      <th>x98</th>\n",
       "      <th>x99</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.696199</td>\n",
       "      <td>-0.792598</td>\n",
       "      <td>-0.349427</td>\n",
       "      <td>-0.464560</td>\n",
       "      <td>3.187014</td>\n",
       "      <td>0.035976</td>\n",
       "      <td>1.033274</td>\n",
       "      <td>-1.504968</td>\n",
       "      <td>0.204693</td>\n",
       "      <td>1.691204</td>\n",
       "      <td>...</td>\n",
       "      <td>1.488142</td>\n",
       "      <td>-0.686337</td>\n",
       "      <td>2.084970</td>\n",
       "      <td>-0.685140</td>\n",
       "      <td>-2.049451</td>\n",
       "      <td>2.015426</td>\n",
       "      <td>1.158477</td>\n",
       "      <td>-0.309441</td>\n",
       "      <td>-1.549833</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.236696</td>\n",
       "      <td>-2.202342</td>\n",
       "      <td>0.024023</td>\n",
       "      <td>1.497700</td>\n",
       "      <td>-0.069758</td>\n",
       "      <td>-2.467088</td>\n",
       "      <td>1.126529</td>\n",
       "      <td>-0.570557</td>\n",
       "      <td>2.079251</td>\n",
       "      <td>-1.882632</td>\n",
       "      <td>...</td>\n",
       "      <td>0.405567</td>\n",
       "      <td>0.509564</td>\n",
       "      <td>1.374071</td>\n",
       "      <td>-0.016943</td>\n",
       "      <td>-0.429280</td>\n",
       "      <td>-0.895016</td>\n",
       "      <td>1.259566</td>\n",
       "      <td>-0.354139</td>\n",
       "      <td>0.806797</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.436683</td>\n",
       "      <td>1.563816</td>\n",
       "      <td>-0.895999</td>\n",
       "      <td>-0.580425</td>\n",
       "      <td>0.311060</td>\n",
       "      <td>-0.187369</td>\n",
       "      <td>0.805249</td>\n",
       "      <td>-2.399522</td>\n",
       "      <td>-0.578818</td>\n",
       "      <td>1.586981</td>\n",
       "      <td>...</td>\n",
       "      <td>0.933578</td>\n",
       "      <td>-1.285978</td>\n",
       "      <td>0.503162</td>\n",
       "      <td>0.204829</td>\n",
       "      <td>-0.753835</td>\n",
       "      <td>0.290033</td>\n",
       "      <td>1.721487</td>\n",
       "      <td>1.304518</td>\n",
       "      <td>0.478903</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.425908</td>\n",
       "      <td>0.400055</td>\n",
       "      <td>-0.305038</td>\n",
       "      <td>-0.930251</td>\n",
       "      <td>-2.214549</td>\n",
       "      <td>1.763379</td>\n",
       "      <td>-0.239868</td>\n",
       "      <td>-2.058891</td>\n",
       "      <td>-1.006533</td>\n",
       "      <td>-2.156839</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.849927</td>\n",
       "      <td>1.402768</td>\n",
       "      <td>0.393653</td>\n",
       "      <td>-1.466818</td>\n",
       "      <td>0.152257</td>\n",
       "      <td>-4.004950</td>\n",
       "      <td>0.676342</td>\n",
       "      <td>-1.927319</td>\n",
       "      <td>1.959032</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.186156</td>\n",
       "      <td>-0.975764</td>\n",
       "      <td>0.594660</td>\n",
       "      <td>-1.181980</td>\n",
       "      <td>-1.443414</td>\n",
       "      <td>-0.797651</td>\n",
       "      <td>-1.252608</td>\n",
       "      <td>-0.060452</td>\n",
       "      <td>0.130702</td>\n",
       "      <td>-2.343517</td>\n",
       "      <td>...</td>\n",
       "      <td>-1.444435</td>\n",
       "      <td>-1.818126</td>\n",
       "      <td>0.446574</td>\n",
       "      <td>0.239328</td>\n",
       "      <td>0.802939</td>\n",
       "      <td>-2.035289</td>\n",
       "      <td>-1.433793</td>\n",
       "      <td>-0.218596</td>\n",
       "      <td>0.619317</td>\n",
       "      <td>9.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         x0        x1        x2        x3        x4        x5        x6  \\\n",
       "0  1.696199 -0.792598 -0.349427 -0.464560  3.187014  0.035976  1.033274   \n",
       "1 -0.236696 -2.202342  0.024023  1.497700 -0.069758 -2.467088  1.126529   \n",
       "2 -0.436683  1.563816 -0.895999 -0.580425  0.311060 -0.187369  0.805249   \n",
       "3  1.425908  0.400055 -0.305038 -0.930251 -2.214549  1.763379 -0.239868   \n",
       "4 -0.186156 -0.975764  0.594660 -1.181980 -1.443414 -0.797651 -1.252608   \n",
       "\n",
       "         x7        x8        x9   ...         x91       x92       x93  \\\n",
       "0 -1.504968  0.204693  1.691204   ...    1.488142 -0.686337  2.084970   \n",
       "1 -0.570557  2.079251 -1.882632   ...    0.405567  0.509564  1.374071   \n",
       "2 -2.399522 -0.578818  1.586981   ...    0.933578 -1.285978  0.503162   \n",
       "3 -2.058891 -1.006533 -2.156839   ...   -0.849927  1.402768  0.393653   \n",
       "4 -0.060452  0.130702 -2.343517   ...   -1.444435 -1.818126  0.446574   \n",
       "\n",
       "        x94       x95       x96       x97       x98       x99  target  \n",
       "0 -0.685140 -2.049451  2.015426  1.158477 -0.309441 -1.549833     4.0  \n",
       "1 -0.016943 -0.429280 -0.895016  1.259566 -0.354139  0.806797     5.0  \n",
       "2  0.204829 -0.753835  0.290033  1.721487  1.304518  0.478903     3.0  \n",
       "3 -1.466818  0.152257 -4.004950  0.676342 -1.927319  1.959032     8.0  \n",
       "4  0.239328  0.802939 -2.035289 -1.433793 -0.218596  0.619317     9.0  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_classif_q4 = pd.read_csv(root+\"classificacao_Q4.csv\")\n",
    "df_classif_q4.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1500"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_classif_q4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = df_classif_q4.drop(['target'],axis=1).values\n",
    "target = df_classif_q4['target'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6866666666666666\n",
      "0.66\n",
      "0.62\n",
      "0.6533333333333333\n",
      "0.6466666666666666\n",
      "0.6666666666666666\n",
      "0.6466666666666666\n",
      "0.6533333333333333\n",
      "0.68\n",
      "0.6066666666666667\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.6519999999999999"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accs=[]\n",
    "\n",
    "kf = KFold(n_splits=10)\n",
    "\n",
    "for train_index, test_index in kf.split(data):\n",
    "#     print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = data[train_index], data[test_index]\n",
    "    y_train, y_test = target[train_index], target[test_index]\n",
    "\n",
    "    knn = KNeighborsClassifier(n_neighbors=15,metric='euclidean')\n",
    "    knn.fit(X_train,y_train)\n",
    "    \n",
    "    preds = knn.predict(X_test)\n",
    "    \n",
    "    acc = metrics.accuracy_score(y_test,preds)\n",
    "    accs.append(acc)\n",
    "    print(acc)\n",
    "\n",
    "np.mean(accs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x0</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>x8</th>\n",
       "      <th>x9</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.351837</td>\n",
       "      <td>0.719462</td>\n",
       "      <td>0.862522</td>\n",
       "      <td>-1.131049</td>\n",
       "      <td>0.744903</td>\n",
       "      <td>0.532177</td>\n",
       "      <td>1.595572</td>\n",
       "      <td>0.607174</td>\n",
       "      <td>1.306696</td>\n",
       "      <td>0.450022</td>\n",
       "      <td>170.471696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.309565</td>\n",
       "      <td>0.786033</td>\n",
       "      <td>1.175365</td>\n",
       "      <td>-1.149287</td>\n",
       "      <td>0.318301</td>\n",
       "      <td>0.620987</td>\n",
       "      <td>1.470917</td>\n",
       "      <td>0.568968</td>\n",
       "      <td>1.334661</td>\n",
       "      <td>-0.267235</td>\n",
       "      <td>151.023999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-1.808729</td>\n",
       "      <td>-0.581660</td>\n",
       "      <td>-0.052200</td>\n",
       "      <td>-2.411602</td>\n",
       "      <td>0.171517</td>\n",
       "      <td>-1.174627</td>\n",
       "      <td>0.010317</td>\n",
       "      <td>-0.677372</td>\n",
       "      <td>0.841643</td>\n",
       "      <td>-1.186626</td>\n",
       "      <td>-268.606299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.386627</td>\n",
       "      <td>0.991724</td>\n",
       "      <td>0.234672</td>\n",
       "      <td>-0.431101</td>\n",
       "      <td>-0.160947</td>\n",
       "      <td>0.521185</td>\n",
       "      <td>1.957080</td>\n",
       "      <td>0.651099</td>\n",
       "      <td>0.933481</td>\n",
       "      <td>1.280705</td>\n",
       "      <td>180.737656</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.879158</td>\n",
       "      <td>0.106556</td>\n",
       "      <td>0.775418</td>\n",
       "      <td>-1.046543</td>\n",
       "      <td>0.097267</td>\n",
       "      <td>-0.757026</td>\n",
       "      <td>1.050900</td>\n",
       "      <td>-0.262257</td>\n",
       "      <td>1.383563</td>\n",
       "      <td>-1.152509</td>\n",
       "      <td>-38.200673</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         x0        x1        x2        x3        x4        x5        x6  \\\n",
       "0 -0.351837  0.719462  0.862522 -1.131049  0.744903  0.532177  1.595572   \n",
       "1 -0.309565  0.786033  1.175365 -1.149287  0.318301  0.620987  1.470917   \n",
       "2 -1.808729 -0.581660 -0.052200 -2.411602  0.171517 -1.174627  0.010317   \n",
       "3 -0.386627  0.991724  0.234672 -0.431101 -0.160947  0.521185  1.957080   \n",
       "4 -0.879158  0.106556  0.775418 -1.046543  0.097267 -0.757026  1.050900   \n",
       "\n",
       "         x7        x8        x9      target  \n",
       "0  0.607174  1.306696  0.450022  170.471696  \n",
       "1  0.568968  1.334661 -0.267235  151.023999  \n",
       "2 -0.677372  0.841643 -1.186626 -268.606299  \n",
       "3  0.651099  0.933481  1.280705  180.737656  \n",
       "4 -0.262257  1.383563 -1.152509  -38.200673  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_regr_q6 = pd.read_csv(root+\"regressao_Q6.csv\")\n",
    "df_regr_q6.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1100"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_regr_q6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([-0.35,  0.72,  0.86, -1.13,  0.74,  0.53,  1.6 ,  0.61,  1.31,\n",
       "         0.45]), 170.47169558790597)"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = df_regr_q6.drop(['target'],axis=1).values\n",
    "y = df_regr_q6['target'].values\n",
    "\n",
    "X[0],y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22.058797611970366\n",
      "27.431741940951238\n"
     ]
    }
   ],
   "source": [
    "loo = LeaveOneOut()\n",
    "\n",
    "test_errs = []\n",
    "train_errs = []\n",
    "\n",
    "for train_index, test_index in loo.split(X):\n",
    "    X_train, X_test = X[train_index], X[test_index]\n",
    "    y_train, y_test = y[train_index], y[test_index]\n",
    "    \n",
    "    clf = Ridge(alpha=1.7)\n",
    "    clf.fit(X_train,y_train)\n",
    "    \n",
    "    preds_test = clf.predict(X_test)\n",
    "    preds_train = clf.predict(X_train)\n",
    "    \n",
    "    test_errs.append(sqrt(metrics.mean_squared_error(y_test,preds_test)))\n",
    "    train_errs.append(sqrt(metrics.mean_squared_error(y_train,preds_train)))\n",
    "\n",
    "print(np.mean(test_errs))    \n",
    "print(np.mean(train_errs))\n",
    "#     print(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x0</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>x8</th>\n",
       "      <th>x9</th>\n",
       "      <th>...</th>\n",
       "      <th>x11</th>\n",
       "      <th>x12</th>\n",
       "      <th>x13</th>\n",
       "      <th>x14</th>\n",
       "      <th>x15</th>\n",
       "      <th>x16</th>\n",
       "      <th>x17</th>\n",
       "      <th>x18</th>\n",
       "      <th>x19</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.591690</td>\n",
       "      <td>0.591690</td>\n",
       "      <td>1.172172</td>\n",
       "      <td>-0.983677</td>\n",
       "      <td>-1.723861</td>\n",
       "      <td>-1.872732</td>\n",
       "      <td>1.070023</td>\n",
       "      <td>-0.623034</td>\n",
       "      <td>1.705102</td>\n",
       "      <td>2.643913</td>\n",
       "      <td>...</td>\n",
       "      <td>2.643913</td>\n",
       "      <td>-0.623034</td>\n",
       "      <td>2.643913</td>\n",
       "      <td>-1.191899</td>\n",
       "      <td>0.523268</td>\n",
       "      <td>1.599678</td>\n",
       "      <td>0.228014</td>\n",
       "      <td>0.636366</td>\n",
       "      <td>1.070023</td>\n",
       "      <td>183.381979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.617718</td>\n",
       "      <td>0.617718</td>\n",
       "      <td>1.066885</td>\n",
       "      <td>-0.994460</td>\n",
       "      <td>-1.660279</td>\n",
       "      <td>-2.006098</td>\n",
       "      <td>0.800897</td>\n",
       "      <td>-0.150230</td>\n",
       "      <td>1.601513</td>\n",
       "      <td>2.227607</td>\n",
       "      <td>...</td>\n",
       "      <td>2.227607</td>\n",
       "      <td>-0.150230</td>\n",
       "      <td>2.227607</td>\n",
       "      <td>-1.389668</td>\n",
       "      <td>0.946524</td>\n",
       "      <td>1.427928</td>\n",
       "      <td>0.327064</td>\n",
       "      <td>0.605663</td>\n",
       "      <td>0.800897</td>\n",
       "      <td>171.166244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.091250</td>\n",
       "      <td>-0.091250</td>\n",
       "      <td>0.460780</td>\n",
       "      <td>-2.076651</td>\n",
       "      <td>-2.903264</td>\n",
       "      <td>-3.110400</td>\n",
       "      <td>-0.361113</td>\n",
       "      <td>-2.029327</td>\n",
       "      <td>0.320967</td>\n",
       "      <td>1.514848</td>\n",
       "      <td>...</td>\n",
       "      <td>1.514848</td>\n",
       "      <td>-2.029327</td>\n",
       "      <td>1.514848</td>\n",
       "      <td>-2.855408</td>\n",
       "      <td>-0.856214</td>\n",
       "      <td>0.329120</td>\n",
       "      <td>-1.001805</td>\n",
       "      <td>-0.826144</td>\n",
       "      <td>-0.361113</td>\n",
       "      <td>-310.459284</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.264834</td>\n",
       "      <td>-0.264834</td>\n",
       "      <td>1.061765</td>\n",
       "      <td>-0.539020</td>\n",
       "      <td>-0.972137</td>\n",
       "      <td>-1.102153</td>\n",
       "      <td>1.198790</td>\n",
       "      <td>-0.534910</td>\n",
       "      <td>2.272581</td>\n",
       "      <td>2.748045</td>\n",
       "      <td>...</td>\n",
       "      <td>2.748045</td>\n",
       "      <td>-0.534910</td>\n",
       "      <td>2.748045</td>\n",
       "      <td>-0.306130</td>\n",
       "      <td>1.134861</td>\n",
       "      <td>1.905080</td>\n",
       "      <td>0.998317</td>\n",
       "      <td>0.916976</td>\n",
       "      <td>1.198790</td>\n",
       "      <td>228.015980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.308724</td>\n",
       "      <td>0.308724</td>\n",
       "      <td>0.793572</td>\n",
       "      <td>-1.162892</td>\n",
       "      <td>-2.315907</td>\n",
       "      <td>-2.436588</td>\n",
       "      <td>0.038154</td>\n",
       "      <td>-1.335393</td>\n",
       "      <td>1.586416</td>\n",
       "      <td>1.898164</td>\n",
       "      <td>...</td>\n",
       "      <td>1.898164</td>\n",
       "      <td>-1.335393</td>\n",
       "      <td>1.898164</td>\n",
       "      <td>-2.222185</td>\n",
       "      <td>-0.049834</td>\n",
       "      <td>0.930992</td>\n",
       "      <td>0.196569</td>\n",
       "      <td>-0.607837</td>\n",
       "      <td>0.038154</td>\n",
       "      <td>-51.379428</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         x0        x1        x2        x3        x4        x5        x6  \\\n",
       "0  0.591690  0.591690  1.172172 -0.983677 -1.723861 -1.872732  1.070023   \n",
       "1  0.617718  0.617718  1.066885 -0.994460 -1.660279 -2.006098  0.800897   \n",
       "2 -0.091250 -0.091250  0.460780 -2.076651 -2.903264 -3.110400 -0.361113   \n",
       "3 -0.264834 -0.264834  1.061765 -0.539020 -0.972137 -1.102153  1.198790   \n",
       "4  0.308724  0.308724  0.793572 -1.162892 -2.315907 -2.436588  0.038154   \n",
       "\n",
       "         x7        x8        x9     ...           x11       x12       x13  \\\n",
       "0 -0.623034  1.705102  2.643913     ...      2.643913 -0.623034  2.643913   \n",
       "1 -0.150230  1.601513  2.227607     ...      2.227607 -0.150230  2.227607   \n",
       "2 -2.029327  0.320967  1.514848     ...      1.514848 -2.029327  1.514848   \n",
       "3 -0.534910  2.272581  2.748045     ...      2.748045 -0.534910  2.748045   \n",
       "4 -1.335393  1.586416  1.898164     ...      1.898164 -1.335393  1.898164   \n",
       "\n",
       "        x14       x15       x16       x17       x18       x19      target  \n",
       "0 -1.191899  0.523268  1.599678  0.228014  0.636366  1.070023  183.381979  \n",
       "1 -1.389668  0.946524  1.427928  0.327064  0.605663  0.800897  171.166244  \n",
       "2 -2.855408 -0.856214  0.329120 -1.001805 -0.826144 -0.361113 -310.459284  \n",
       "3 -0.306130  1.134861  1.905080  0.998317  0.916976  1.198790  228.015980  \n",
       "4 -2.222185 -0.049834  0.930992  0.196569 -0.607837  0.038154  -51.379428  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the Q7 regression dataset (CSV with a header row) and preview its first rows.\n",
    "q7_csv_path = root + \"regressao_Q7.csv\"\n",
    "df_regr_q7 = pd.read_csv(q7_csv_path)\n",
    "df_regr_q7.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Split the Q7 frame into the feature matrix (every column except 'target')\n",
    "# and the target vector, both as plain NumPy arrays.\n",
    "X = df_regr_q7.drop(columns=['target']).values\n",
    "y = df_regr_q7['target'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4.598442324179443e-08, 54.838118626002235)"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 10-fold cross-validation of a decision-tree regressor on the Q7 data,\n",
    "# reporting the mean absolute error (MAE) averaged over the folds.\n",
    "kf = KFold(n_splits=10)\n",
    "\n",
    "maes_train = []\n",
    "maes_test = []\n",
    "\n",
    "# Split on X itself rather than on an unrelated `data` variable left over from\n",
    "# another cell, so the fold indices always match the arrays being indexed.\n",
    "for train_index, test_index in kf.split(X):\n",
    "    X_train, X_test = X[train_index], X[test_index]\n",
    "    y_train, y_test = y[train_index], y[test_index]\n",
    "\n",
    "    # Fixed seed (the notebook mandates seed 42): the tree permutes features\n",
    "    # internally at each split, so an unseeded tree is not reproducible.\n",
    "    regressor = DecisionTreeRegressor(random_state=42)\n",
    "    regressor.fit(X_train, y_train)\n",
    "\n",
    "    preds_train = regressor.predict(X_train)\n",
    "    preds_test = regressor.predict(X_test)\n",
    "\n",
    "    maes_train.append(metrics.mean_absolute_error(y_train, preds_train))\n",
    "    maes_test.append(metrics.mean_absolute_error(y_test, preds_test))\n",
    "\n",
    "# Displayed as (mean train MAE, mean test MAE).\n",
    "np.mean(maes_train), np.mean(maes_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### q9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Ks_2sampResult(statistic=0.6, pvalue=0.031046781145641363)"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(42)\n",
    "\n",
    "# Two-sample Kolmogorov-Smirnov test on the two observed samples.\n",
    "sample_a = [5,3,3,11,8,7,1,5,4,9]\n",
    "sample_b = [2,1,1,4,10,1,1,1,3,2]\n",
    "sp.ks_2samp(sample_a, sample_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9425"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Law of total probability:\n",
    "# P(right) = P(right | bad) * P(bad) + P(right | good) * P(good)\n",
    "p_bad = 0.15\n",
    "p_right_given_bad = 0.9\n",
    "p_right_given_good = 0.95\n",
    "\n",
    "p_good = 1 - p_bad\n",
    "p_right = p_right_given_bad * p_bad + p_right_given_good * p_good\n",
    "p_right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9425"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total-probability sum written out with literal values, as a cross-check.\n",
    "0.9 * 0.15 + 0.95 * 0.85"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Global TF Kernel (Python 3)",
   "language": "python",
   "name": "global-tf-python-3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
